BindingSiteSet.txt

Releases

Comparison of the BindingSiteSet.txt file between the last releases.

Release version Date
10.6 2019 July
10.6.3 -
10.7 2020 April
10.8 2020 October
10.9 2021 April
10.10 2022 February
11.0 2022 August
11.0.1 (not public) 2022 September
11.0.2 (not public) 2022 September

Notes:

  • V10.6 excluded because it doesn’t include TU_ID, only TU names which are not unique; otherwise it’s almost the same as v10.7 which is included
  • Some formatting is applied to make columns uniform across versions (uppercase/lowercase in confidence levels, word or symbol (+/-) for strand and effect, etc.)
  • Starting v11.0.1 there are 2 evidence columns (site evidence and function evidence), here I merged them so I can compare with older versions
# Root directory that holds one sub-directory per RegulonDB release.
dir_releases <- "_Databases/RegulonDB/releases"

## v10.6 is excluded because it doesn't include TU_ID, only TU names, which are
## not unique; otherwise it's almost the same as v10.7, which is included.
## Some formatting is done to make columns such as strand and confidence
## uniform across versions (uppercase/lowercase, word or symbol for strand).
## Starting with v11.0.1 there are 2 evidence columns; they are merged here so
## the result can be compared with older versions.

dir_versions <- c("10.7", "10.8", "10.9", "10.10", "11.0", "11.0.1", "11.0.2")

# Version tags ("v10.7", ...) in release order; reused below as factor levels.
tfbs_versions <- paste0("v", dir_versions)

# Read every release into `tfbs_sets` (named by tag) and, because later code
# refers to them by name, also into one variable per release: tfbs_set_v<release>.
tfbs_sets <- list()
for (v in dir_versions) {
  version_tag <- paste0("v", v)
  raw <- read.delim(
    paste0(dir_releases, "/", v, "/BindingSiteSet.tsv"),
    comment.char = "#", header = TRUE, stringsAsFactors = FALSE,
    na.strings = c("", "NA")
  )
  # Only some releases carry a separate function-evidence column.
  has_function_evidence <- "evidence_function" %in% colnames(raw)

  set <- raw %>%
    dplyr::mutate(version = version_tag) %>%
    # Accept both spellings of the strand: words are converted to symbols and
    # symbols are kept as they are; anything else becomes NA.
    dplyr::mutate(strand = dplyr::case_when(
      strand %in% c("reverse", "-") ~ "-",
      strand %in% c("forward", "+") ~ "+",
      TRUE ~ NA_character_
    )) %>%
    dplyr::mutate(confidence = tolower(confidence)) %>%
    # concat_uniq2() is a project helper; it is applied row by row.
    dplyr::rowwise() %>%
    dplyr::mutate(
      evidence = if (has_function_evidence) {
        concat_uniq2(evidence, evidence_function)
      } else {
        evidence
      }
    ) %>%
    # Drop the row-wise grouping so it does not leak into the stored sets.
    dplyr::ungroup() %>%
    dplyr::mutate(coords = paste0(start, "_", stop))

  assign(paste0("tfbs_set_", version_tag), set)
  tfbs_sets[[version_tag]] <- set
}

# Stack all releases and fix the level order of the categorical columns.
all_tfbs <- bind_rows(tfbs_sets) %>%
  dplyr::mutate(version = factor(version, levels = tfbs_versions)) %>%
  # NOTE(review): any `effect` value other than "+", "-" or "?" becomes NA
  # here; confirm that every release already uses these symbols.
  dplyr::mutate(effect = factor(effect, levels = c("+", "-", "?"))) %>%
  dplyr::mutate(confidence = factor(confidence, levels = c("weak", "strong", "confirmed")))

# One row per evidence item: split the comma-separated evidence field, strip
# the square brackets, then split each item on "|" into code, level and name.
all_tfbs_by_evidence <- all_tfbs %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = gsub("\\[|\\]", "", evidence)) %>%
  tidyr::separate(evidence, c("evidence_code", "evidence_level", "evidence_name"), sep = "\\|")

Overall number of TFBSs

# Number of BindingSiteSet rows in each release, in release order.
tfbs_summary <- all_tfbs %>%
  dplyr::group_by(version) %>%
  dplyr::summarise(total = n()) %>%
  dplyr::arrange(version)

# Bar chart of the per-release totals; simple_bar() is a project helper.
TFBS_num <- simple_bar(tfbs_summary, "version", "total") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(title = "", x = "Release version", y = "Number of TFBSs")

TFBS_num

# The same numbers as a table.
DT::datatable(
  tfbs_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Comments:

  • Sudden increase with 488 new entries in version 10.9

TFBS IDs

TFBS IDs shared between versions

# Unique TFBS IDs of every release, as a list named by version tag.
tfbs_ids <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$TFBS_ID)
)

# UpSet plot of the IDs shared between releases. Highlighted intersections:
# red = v10.7 only and v10.7 + v10.8, blue = v10.9 through v11.0.2.
UpSetR::upset(
  fromList(tfbs_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  queries = list(
    list(query = intersects, params = list("v10.7"), color = "red", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = TRUE),
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE)
  )
)

Comments:

  • The oldest versions have fewer entries but more unique TFBS IDs
  • 421 TFBS IDs disappear from version 10.8 until now (red bar)
  • There are only 21 new IDs present in version 10.9 to 11.0.2 despite the sudden increase in total entries (blue bar)

Duplicated TFBS IDs

# For every release, count how often each TFBS_ID occurs, then count how many
# IDs share each occurrence number (1 = the ID appears on a single row).
tfbs_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(tfbs_number = n()) %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Upper panel: the full distribution of occurrence counts.
dodge <- ggplot(
  tfbs_ids_dupli,
  aes(x = version, y = tfbs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    title = "TFBS ID duplication in BindingSiteSet.txt across versions",
    x = "",
    y = "Number of unique TFBS IDs"
  )

# Lower panel: the same chart without the IDs that occur 1 to 4 times.
dodge2 <- ggplot(
  tfbs_ids_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
  aes(x = version, y = tfbs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "...minimum 5 copies", x = "", y = "Number of unique TFBS IDs")

# Stack the two panels vertically.
dodge / dodge2

Which TFBS IDs are most duplicated?

## TFBS IDs with at least 10 rows in a given version, one column per version
## (most duplicated first)
tfbs_ids_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, TFBS_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(desc(occurrences)) %>%
  pivot_wider(
    names_from = version,
    values_from = c(occurrences)
  )

DT::datatable(
  tfbs_ids_dupli_max,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Coordinates

# Unique "start_stop" coordinate strings of every release, named by version tag.
coordinates <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$coords)
)

# UpSet plot of the coordinates shared between releases. Highlighted
# intersections: blue = v10.9 through v11.0.2, red = v10.7 + v10.8.
UpSetR::upset(
  fromList(coordinates),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  queries = list(
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = TRUE)
  )
)

# For every release, count how often each coordinate pair ("start_stop")
# occurs, then count how many coordinate pairs share each occurrence number.
coords_dupli <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(coords_number = n()) %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Upper panel: the full distribution of occurrence counts.
# The labels describe coordinates (they previously repeated the TFBS ID
# wording of the preceding chart).
dodge <- ggplot(coords_dupli, aes(fill = occurrences, y = coords_number, x = version)) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    x = "",
    y = "Number of unique coordinates",
    title = "Coordinate duplication in BindingSiteSet.txt across versions"
  )

# Lower panel: the same chart without coordinates that occur 1 to 4 times.
dodge2 <- ggplot(
  coords_dupli %>% dplyr::filter(!occurrences %in% c("1", "2", "3", "4")),
  aes(fill = occurrences, y = coords_number, x = version)
) +
  geom_bar(position = position_dodge(preserve = "single"), stat = "identity") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of unique coordinates", title = "...minimum 5 copies")

# Stack the two panels vertically.
dodge / dodge2

Which coordinates are most duplicated?

## Coordinate pairs with at least 10 rows in a given version, one column per
## version (most duplicated first)
coords_dupli_max <- all_tfbs %>%
  dplyr::group_by(version, coords) %>%
  dplyr::summarise(occurrences = n()) %>%
  dplyr::filter(occurrences >= 10) %>%
  dplyr::arrange(desc(occurrences)) %>%
  pivot_wider(
    names_from = version,
    values_from = c(occurrences)
  )

DT::datatable(
  coords_dupli_max,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Distance to TSS

# One density ridge of distance_TSS per release; the x axis is limited to
# -1000..1000 and the legend is hidden (the y axis already names the release).
ggplot(
  all_tfbs,
  aes(x = distance_TSS, y = version, fill = version)
) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme(legend.position = "none") +
  xlim(-1000, 1000)

Distance to first gene

# One density ridge of distance_gene per release; the x axis is limited to
# -1000..1000 and the legend is hidden (the y axis already names the release).
ggplot(
  all_tfbs,
  aes(x = distance_gene, y = version, fill = version)
) +
  ggridges::geom_density_ridges(color = "white") +
  ggridges::theme_ridges() +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme(legend.position = "none") +
  xlim(-1000, 1000)

Effect

# Number of TFBS rows per release and effect.
tfbs_effect_long <- all_tfbs %>%
  group_by(version, effect) %>%
  summarise(value = n())

# Side-by-side bars, one per effect.
dodge <- ggplot(tfbs_effect_long, aes(x = version, y = value, fill = effect)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(title = "", x = "", y = "Number of TFBS")

# Stacked bars, legend hidden.
stack <- ggplot(tfbs_effect_long, aes(x = version, y = value, fill = effect)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "", x = "", y = "Number of TFBS")

# One line per effect across releases.
line <- ggplot(tfbs_effect_long, aes(x = version, y = value, group = effect)) +
  geom_line(aes(color = effect)) +
  geom_point(aes(color = effect), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(title = "", x = "Release version", y = "Number of TFBS")

## Convert to plotly so the figures are interactive, then stack them
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)

# The same counts as a wide table: one row per effect, one column per release.
effect_summary <- all_tfbs %>%
  group_by(version, effect) %>%
  summarise(value = n()) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = c(value)) %>%
  arrange(effect)

DT::datatable(
  effect_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Evidence

# Number of evidence items per release, evidence code and evidence name.
tfbs_evidence_long <- all_tfbs_by_evidence %>%
  group_by(version, evidence_code, evidence_name) %>%
  summarise(value = n())

# One colour per distinct evidence code; random_palette() is a project helper.
evidence_palette <- random_palette(
  length(unique(all_tfbs_by_evidence$evidence_code))
)

# Side-by-side bars, filled by evidence code and grouped by evidence name.
dodge <- ggplot(
  tfbs_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_name)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_manual(values = evidence_palette, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(title = "", x = "", y = "Number of TFBSs")

# Stacked bars, legend hidden.
stack <- ggplot(
  tfbs_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_name)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = evidence_palette, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "", x = "", y = "Number of evidence")

# One line per evidence name across releases, coloured by evidence code.
line <- ggplot(
  tfbs_evidence_long,
  aes(x = version, y = value, group = evidence_name)
) +
  geom_line(aes(color = evidence_code)) +
  scale_color_manual(values = evidence_palette, na.value = "gray", drop = FALSE) +
  geom_point(aes(color = evidence_code), size = 2) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "", x = "Release version", y = "Number of TFBS")

# Interactive versions; the overall TFBS count chart goes on top.
fig0 <- ggplotly(TFBS_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4)

# For every evidence code/name, the releases it occurs in; concat_uniq() is a
# project helper.
tfbs_evidence_table <- all_tfbs_by_evidence %>%
  group_by(evidence_code, evidence_name) %>%
  summarise(version = concat_uniq(version))

DT::datatable(
  tfbs_evidence_table,
  rownames = FALSE,
  options = list(searching = TRUE, lengthChange = FALSE, pageLength = 10)
)

# Unique evidence codes of every release, as a list named by version tag.
tfbs_evidence_shared <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) {
    unique((all_tfbs_by_evidence %>% dplyr::filter(version == tag))$evidence_code)
  }
)

# UpSet plot of the evidence codes shared between releases (no highlighted
# intersections).
UpSetR::upset(
  fromList(tfbs_evidence_shared),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6)
)

Confidence

# Number of TFBS rows per release and confidence level.
tfbs_confidence_long <- all_tfbs %>%
  group_by(version, confidence) %>%
  summarise(value = n())

# Side-by-side bars, one per confidence level.
dodge <- ggplot(tfbs_confidence_long, aes(x = version, y = value, fill = confidence)) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(title = "", x = "", y = "Number of TFBS")

# Stacked bars, legend hidden.
stack <- ggplot(tfbs_confidence_long, aes(x = version, y = value, fill = confidence)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "", x = "", y = "Number of TFBS")

# One line per confidence level across releases.
line <- ggplot(tfbs_confidence_long, aes(x = version, y = value, group = confidence)) +
  geom_line(aes(color = confidence)) +
  geom_point(aes(color = confidence), size = 2) +
  scale_color_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(title = "", x = "Release version", y = "Number of TFBS")

## Convert to plotly so the figures are interactive, then stack them
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig1, fig2, fig3, nrows = 3)

# Wide table of the same counts: one row per confidence level (missing
# confidence is shown as "null"), one column per release, absent combinations
# filled with 0, and a final "total" row holding the column sums.
confidence_summary <- all_tfbs %>%
  group_by(version, confidence) %>%
  summarise(value = n()) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = c(value)) %>%
  mutate(across(starts_with("v"), ~replace_na(., 0))) %>%
  bind_rows(
    summarise(
      .,
      across(where(is.numeric), sum),
      across(where(is.character), ~"total")
    )
  ) %>%
  mutate(
    confidence = factor(
      confidence,
      levels = c("weak", "strong", "confirmed", "null", "total")
    )
  ) %>%
  arrange(confidence)

DT::datatable(
  confidence_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Comments:

  • Version 10.8 seems to have too many confirmed TFBSs
  • Version 10.9 loses most confidence information
  • Versions 10.10 and 11.0 have mostly weak sites and many NAs: only function evidence is taken into account (not site evidence)
  • Version 11.0.1 does not have confirmed TFBSs: only site evidence is taken into account (not function evidence)
  • Version 11.0.2 seems about right?

RI IDs

# Unique RI IDs of every release, as a list named by version tag.
ri_ids <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$RI_ID)
)

# UpSet plot of the RI IDs shared between releases. Blue marks intersections
# of recent releases, red marks intersections that start at v10.7.
UpSetR::upset(
  fromList(ri_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  queries = list(
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9", "v10.10"), color = "red", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8"), color = "red", active = TRUE)
  )
)

Duplicate RI IDs

# For every release, count how often each RI_ID occurs, then count how many
# RI IDs share each occurrence number.
ris_ids_dupli <- all_tfbs %>%
  dplyr::group_by(version, RI_ID) %>%
  dplyr::summarise(occurrences = n()) %>%
  group_by(version, occurrences) %>%
  summarise(ris_number = n()) %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Upper panel: the full distribution of occurrence counts.
dodge <- ggplot(
  ris_ids_dupli,
  aes(x = version, y = ris_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    title = "RI ID duplication in BindingSiteSet.txt across versions",
    x = "",
    y = "Number of unique RI IDs"
  )

# Lower panel: the same chart without the RI IDs that occur exactly once.
dodge2 <- ggplot(
  ris_ids_dupli %>% dplyr::filter(!occurrences %in% c("1")),
  aes(x = version, y = ris_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "...minimum 2 copies", x = "", y = "Number of unique RI IDs")

# Stack the two panels vertically.
dodge / dodge2

TFs

# Unique TF IDs of every release, as a list named by version tag.
tf_ids <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$TF_ID)
)

# UpSet plot of the TF IDs shared between releases. Blue marks intersections
# of recent releases, red marks intersections that start at v10.7.
UpSetR::upset(
  fromList(tf_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TF IDs",
  queries = list(
    list(query = intersects, params = list("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = TRUE),
    list(query = intersects, params = list("v10.7"), color = "red", active = TRUE)
  )
)

# TF IDs present in v10.7 but absent from v11.0.2. Taken directly from the ID
# sets so the result does not depend on fromList() carrying the IDs as row
# names or on filter() keeping those row names.
tf_ids_gone <- base::setdiff(tf_ids[["v10.7"]], tf_ids[["v11.0.2"]])

# Unique TF names of every release, as a list named by version tag.
tf_names <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$TF_name)
)

# UpSet plot of the TF names shared between releases.
UpSetR::upset(
  fromList(tf_names),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TF names",
  queries = list(
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = TRUE)
  )
)

# TF names present in v10.7 but absent from v11.0.2 (same approach as above).
tf_names_gone <- base::setdiff(tf_names[["v10.7"]], tf_names[["v11.0.2"]])
  • 5 TF IDs disappear: ECK125257186, ECK120048948, ECK125257190, ECK120023539, ECK125257191 (red)
  • 2 TF names disappear: HigBA, YiaJ (red)

Promoters

# Unique promoter names of every release, as a list named by version tag.
promoter_name <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$promoter)
)

# UpSet plot of the promoter names shared between releases. Blue marks
# intersections of recent releases, red marks v10.7 + v10.8 + v10.9.
UpSetR::upset(
  fromList(promoter_name),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique promoter names",
  queries = list(
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = TRUE)
  )
)

TUs

# Unique TU IDs of every release, as a list named by version tag.
tu_ids <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$TU_ID)
)

# UpSet plot of the TU IDs shared between releases.
UpSetR::upset(
  fromList(tu_ids),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TU IDs",
  queries = list(
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = TRUE)
  )
)

# Unique TU names of every release, as a list named by version tag.
tu_names <- lapply(
  setNames(tfbs_versions, tfbs_versions),
  function(tag) unique(get(paste0("tfbs_set_", tag))$TU_name)
)

# UpSet plot of the TU names shared between releases.
UpSetR::upset(
  fromList(tu_names),
  sets = tfbs_versions,
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6),
  sets.x.label = "Number of unique TU names",
  queries = list(
    list(query = intersects, params = list("v10.8", "v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.9", "v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.10", "v11.0", "v11.0.1", "v11.0.2"), color = "blue", active = TRUE),
    list(query = intersects, params = list("v10.7", "v10.8", "v10.9"), color = "red", active = TRUE)
  )
)

Mapping v10.7 vs v11.0.2

Notes:

  • TFBSs from v10.7 and v11.0.2 are mapped by TFBS_ID and promoter name
  • The following table displays the differences

Many (most?) differences seem to be caused by different promoter names

Ex.

  • in 10.7 micFp is regulated by Lrp (TFBS ID ECK120011644)
  • in 11.0.2 micFp1 and micFp2 are both regulated by the same TF and TFBS, and are linked to 2 distinct TUs (both TUs are named micF but have distinct IDs)
  • all three entries have the same evidence
## Full join of releases 10.7 and 11.0.2 on TFBS_ID and promoter name; columns
## present in both releases get the suffixes _10.7 and _11.0.2. Only the TF
## name, TU, coordinate, evidence and confidence columns are kept.
tfbs_join_107_1102 <- tfbs_set_v10.7 %>%
  dplyr::full_join(
    tfbs_set_v11.0.2,
    by = c("TFBS_ID", "promoter"),
    suffix = c("_10.7", "_11.0.2")
  ) %>%
  dplyr::arrange(TFBS_ID) %>%
  dplyr::select(
    TFBS_ID,
    promoter,
    starts_with("TF_name"),
    starts_with("TU"),
    starts_with("coords"),
    starts_with("evidence_1"),
    starts_with("confidence")
  )

## Matches are the rows without any NA in the kept columns; the differences are
## the remaining rows of the join.
tfbs_matches_107_1102 <- tfbs_join_107_1102 %>% na.omit

tfbs_differences_107_1102 <- dplyr::setdiff(tfbs_join_107_1102, tfbs_matches_107_1102)

## Export both tables as tab-separated files in the working directory.
write.table(
  tfbs_join_107_1102,
  file = "TFBS_full_join_107_1102.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
write.table(
  tfbs_differences_107_1102,
  file = "TFBS_differences_107_1102.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

## Searchable table of the differences, 5 rows per page.
## NOTE(review): confirm that column targets 11 and 12 are the ones meant to be
## widened to 200px.
DT::datatable(
  tfbs_differences_107_1102,
  rownames = FALSE,
  options = list(
    searching = TRUE,
    lengthChange = FALSE,
    pageLength = 5,
    columnDefs = list(list(width = '200px', targets = c(11, 12)))
  )
)
# all_tfbs_RI_TU <- tfbs_set_v10.7 %>%
  # dplyr::full_join(tfbs_set_v10.6.3 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.6", "_10.6.3")) %>%
  # dplyr::full_join(tfbs_set_v10.7 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.6.3", "_10.7")) %>%
  # dplyr::full_join(tfbs_set_v10.8 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.7", "_10.8")) %>%
  # dplyr::full_join(tfbs_set_v10.9 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.8", "_10.9")) %>%
  # dplyr::full_join(tfbs_set_v10.10 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.9", "_10.10")) %>%
  # dplyr::full_join(tfbs_set_v11.0 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_10.10", "_11.0")) %>%
  # dplyr::full_join(tfbs_set_v11.0.1 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, evidence_function, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_11.0", "_11.0.1")) %>%
  # dplyr::full_join(tfbs_set_v11.0.2 %>% dplyr::select(TFBS_ID, TF_name, RI_ID, TU_name, evidence, evidence_function, confidence), 
  #                  by = c("TFBS_ID", "TF_name", "RI_ID", "TU_name"), suffix = c("_11.0.1", "_11.0.2")) %>%
  # dplyr::arrange(TFBS_ID) %>%
  # dplyr::select(-conformation_name, -TF_ID) %>%
  # dplyr::rename(confidence_11.0.2 = confidence, evidence_11.0.2 = evidence)
# Rows of all_tfbs_RI_TU that have neither evidence nor confidence in any of
# the 10.6, 10.6.3, 10.7 and 10.8 columns, written to TFBS_new_10.9.tsv.
# NOTE(review): all_tfbs_RI_TU is not assigned anywhere in the visible code
# (its only definition is commented out), so this presumably fails unless the
# object is created elsewhere -- confirm before running.
tfbs_new_from_10.9 <- all_tfbs_RI_TU %>%
  dplyr::filter(is.na(evidence_10.6) & is.na(confidence_10.6) & 
                  is.na(evidence_10.6.3) & is.na(confidence_10.6.3) & 
                  is.na(evidence_10.7) & is.na(confidence_10.7) & 
                  is.na(evidence_10.8) & is.na(confidence_10.8))

write.table(tfbs_new_from_10.9, file = "TFBS_new_10.9.tsv", quote = F, row.names = F, col.names = T, sep = "\t")

# DT::datatable(tfbs_new_from_10.9, rownames= FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 5))

# Rows whose 10.8 confidence is "Confirmed", written to TFBS_confirmed_10.8.tsv.
# NOTE(review): the comparison uses a capitalised "Confirmed", while the
# BindingSiteSet loading step lower-cases the confidence column; verify which
# spelling all_tfbs_RI_TU actually holds.
tfbs_confirmed_10.8 <- all_tfbs_RI_TU %>%
  dplyr::filter(confidence_10.8 == "Confirmed")

write.table(tfbs_confirmed_10.8, file = "TFBS_confirmed_10.8.tsv", quote = F, row.names = F, col.names = T, sep = "\t")

# DT::datatable(tfbs_confirmed_10.8, rownames= FALSE, options = list(searching = FALSE, lengthChange = FALSE, pageLength = 5))

network_tf_gene.txt

Data

# Releases used for the network comparison: every BindingSiteSet release
# except the ones listed here.
versions_nw <- setdiff(dir_versions, c("10.6", "10.10", "11.0.2"))

# Read network_tf_gene.tsv of each release into a variable named
# network_set_<release>, tagging every row with its version ("v<release>").
for (v in versions_nw) {
  assign(
    paste0("network_set_", v),
    read.delim(
      paste0(dir_releases, "/", v, "/network_tf_gene.tsv"),
      comment.char = "#",
      header = TRUE,
      stringsAsFactors = FALSE,
      na.strings = ""
    ) %>%
      dplyr::mutate(version = paste0("v", v))
  )
}

NB: additional TAB characters at the end of each line cause parsing issues

# Number of rows of network_tf_gene in each release, one data frame per
# release, keyed by version tag.
tf_gene_summary <- list()
for (v in versions_nw) {
  tf_gene_summary[[paste0("v", v)]] <- data.frame(
    version = paste0("v", v),
    count = nrow(get(paste0("network_set_", v)))
  )
}

# Combine into one table and keep the releases in their original order.
tf_gene_summary_df <- data.table::rbindlist(tf_gene_summary)
tf_gene_summary_df$version <- factor(
  tf_gene_summary_df$version,
  levels = paste0("v", versions_nw)
)

DT::datatable(
  tf_gene_summary_df,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

# Bar chart of the per-release row counts; simple_bar() is a project helper.
# The y axis counts rows of network_tf_gene, i.e. TF-gene interactions (the
# label previously repeated the TFBS wording of the binding-site chart).
tf_gene_num <- simple_bar(tf_gene_summary_df, "version", "count") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of TF-gene interactions", title = "")

tf_gene_num

TF-gene pairs

# Unique "TF_gene" pair labels of every release, keyed by version tag.
tf_gene_pairs <- list()
for (v in versions_nw) {
  # Build the pair label row by row from the TF and gene names.
  pairs <- mutate(
    rowwise(get(paste0("network_set_", v))),
    TF_gene = paste0(TF_name, "_", gene_name)
  )
  tf_gene_pairs[[paste0("v", v)]] <- unique(pairs$TF_gene)
}

# UpSet plot of the TF-gene pairs shared between releases (no highlighted
# intersections).
UpSetR::upset(
  fromList(tf_gene_pairs),
  sets = paste0("v", versions_nw),
  order.by = "freq",
  keep.order = TRUE,
  text.scale = rep(2, 6)
)

Duplicate TF-gene pairs

# For every release, count how often each TF-gene pair occurs, then count how
# many pairs share each occurrence number.
pairs_ids_dupli <- list()
for (v in versions_nw) {
  pairs_ids_dupli[[paste0("v", v)]] <- get(paste0("network_set_", v)) %>%
    rowwise() %>%
    mutate(TF_gene = paste0(TF_name, "_", gene_name)) %>%
    group_by(TF_gene) %>%
    summarise(occurrences = n()) %>%
    group_by(occurrences) %>%
    summarise(pairs_number = n()) %>%
    mutate(version = paste0("v", v))
}

# One table for all releases, with the occurrence count as a factor and the
# releases kept in their original order.
pairs_ids_dupli_df <- data.table::rbindlist(pairs_ids_dupli) %>%
  dplyr::mutate(occurrences = factor(occurrences))

pairs_ids_dupli_df$version <- factor(
  pairs_ids_dupli_df$version,
  levels = paste0("v", versions_nw)
)

# Upper panel: the full distribution of occurrence counts.
dodge <- ggplot(
  pairs_ids_dupli_df,
  aes(x = version, y = pairs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    title = "TF-gene pairs duplication in network_tf_gene.txt across versions",
    x = "",
    y = "Number of pairs"
  )

# Lower panel: the same chart without the pairs that occur exactly once.
dodge2 <- ggplot(
  pairs_ids_dupli_df %>% dplyr::filter(occurrences != 1),
  aes(x = version, y = pairs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, drop = FALSE, na.value = "gray") +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(title = "...minimum 2 copies", x = "", y = "Number of pairs")

# Stack the two panels vertically.
dodge / dodge2

Evidence in network TF-gene

NB: evidence is formatted in different ways between versions of downloadable file, which can cause parsing issues

# tf_gene_evidence <- bind_rows(network_set_10.6.3, network_set_10.7, network_set_10.8, network_set_10.9, network_set_11.0) %>%
#   tidyr::separate_rows(evidence, sep = ",") %>%
#   dplyr::mutate(evidence_code = gsub("\\[|\\]", "", evidence)) 
# 
# last <- network_set_11.0.1 %>%
#   dplyr::mutate(evidence_function = gsub("\\[\\]", "\\[\\|\\|\\]", evidence_function)) %>%
#   dplyr::mutate(evidence = gsub("\\[\\]", "\\[\\|\\|\\]", evidence)) %>%
#   dplyr::rowwise() %>%
#   dplyr::mutate(evidence = concat_uniq(evidence, evidence_function)) %>%
#   tidyr::separate_rows(evidence, sep = ",") %>%
#   dplyr::mutate(evidence = gsub("\\[|\\]", "", evidence)) %>%
#   tidyr::separate(evidence, c("evidence_code", "evidence_level", "evidence_name"), sep = "\\|") 
#   # dplyr::mutate(fevidence = gsub("\\[|\\]", "", evidence_function)) %>%
#   # tidyr::separate(fevidence, c("fevidence_code", "fevidence_level", "fevidence_name"), sep = "\\|")
# 
# tf_gene_evidence <- bind_rows(tf_gene_evidence, last)
# tf_gene_evidence$version <- factor(tf_gene_evidence$version, levels = paste0("v", versions_nw))


# Evidence codes per release for the TF-gene network.
#
# Releases 10.7 to 11.0: `evidence` is split on "," and the square brackets are
# stripped to obtain one evidence code per row.
tf_gene_evidence <- bind_rows(
  network_set_10.7,
  network_set_10.8,
  network_set_10.9,
  network_set_11.0
) %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence_code = gsub("\\[|\\]", "", evidence))

# Release 11.0.1 has two evidence columns; they are concatenated and each
# entry is split on "|" into code / level / name.
# An empty "[]" entry is first rewritten to "[||]" so it still yields three
# (empty) fields when split.
# No rowwise() is used: gsub() and paste0() are vectorised, so the result is
# the same without per-row evaluation and without adding rowwise grouping.
# NOTE(review): paste0() renders NA as the string "NA"; if either evidence
# column can be NA this produces a literal "NA" evidence code - confirm
# against the data.
last <- network_set_11.0.1 %>%
  dplyr::mutate(
    evidence_function = gsub("\\[\\]", "\\[\\|\\|\\]", evidence_function),
    evidence = gsub("\\[\\]", "\\[\\|\\|\\]", evidence),
    evidence = paste0(evidence, ",", evidence_function)
  ) %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = gsub("\\[|\\]", "", evidence)) %>%
  tidyr::separate(
    evidence,
    c("evidence_code", "evidence_level", "evidence_name"),
    sep = "\\|"
  )

tf_gene_evidence <- bind_rows(tf_gene_evidence, last) %>%
  dplyr::mutate(evidence_code = trimws(evidence_code))

# Fix the release order on the x axis.
tf_gene_evidence$version <- factor(
  tf_gene_evidence$version,
  levels = paste0("v", versions_nw)
)

# Number of rows per (release, evidence code); groups are dropped explicitly
# so the result is a plain table and summarise() emits no grouping message.
tf_gene_evidence_long <- tf_gene_evidence %>%
  dplyr::group_by(version, evidence_code) %>%
  dplyr::summarise(value = dplyr::n(), .groups = "drop")

dodge <- ggplot(
  tf_gene_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of entries", title = "")

stack <- ggplot(
  tf_gene_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of evidence", title = "")

line <- ggplot(
  tf_gene_evidence_long,
  aes(x = version, y = value, group = evidence_code)
) +
  geom_line(aes(color = evidence_code)) +
  geom_point(size = 2, aes(color = evidence_code)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of entries", title = "")

# Interactive versions, stacked in four rows under the entry-count plot.
fig0 <- ggplotly(tf_gene_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4, shareY = TRUE)

Confidence

NB: confidence is now written in the downloadable file in all caps, which can cause mapping issues

# Confidence levels per release for the TF-gene network.
# Count rows per (version, confidence) in every network_set_<version> table.
confidence_nw_all_versions <- list()
for (v in versions_nw) {
  confidence_nw_all_versions[[v]] <- get(paste0("network_set_", v)) %>%
    dplyr::group_by(version, confidence) %>%
    dplyr::summarise(value = dplyr::n(), .groups = "drop")
}

# Confidence is lower-cased so that all-caps releases map onto the same
# levels; any value outside weak / strong / confirmed becomes NA.
confidence_nw_all_versions_df <- data.table::rbindlist(confidence_nw_all_versions) %>%
  dplyr::mutate(confidence = tolower(confidence))
confidence_nw_all_versions_df$confidence <- factor(
  confidence_nw_all_versions_df$confidence,
  levels = c("weak", "strong", "confirmed")
)
confidence_nw_all_versions_df$version <- factor(
  confidence_nw_all_versions_df$version,
  levels = paste0("v", versions_nw)
)

tf_gene_confidence_long <- confidence_nw_all_versions_df

dodge <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of entries", title = "")

# The y label used to read "Number of TFBS"; the bars count network entries,
# like the two sibling plots.
stack <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of entries", title = "")

line <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, group = confidence)
) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of entries", title = "")

fig0 <- ggplotly(tf_gene_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4, shareY = TRUE)

# Wide summary table: one row per confidence level (NA shown as "null"), one
# column per release, missing combinations filled with 0, plus a "total" row.
confidence_nw_summary <- confidence_nw_all_versions_df %>%
  arrange(confidence) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  mutate(across(starts_with("v"), ~ replace_na(., 0))) %>%
  bind_rows(summarise(
    .,
    across(where(is.numeric), sum),
    across(where(is.character), ~"total")
  ))

DT::datatable(
  confidence_nw_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Mapping

# Build one wide table of TF-gene interactions across releases: start from the
# complete v10.6.3 table and full-join the evidence/confidence columns of each
# later release on (TF_name, gene_name, effect), then sort by TF and gene.
#
# NOTE(review): dplyr applies `suffix` only when a non-key column name exists
# on both sides of a join. Assuming network_set_10.6.3 carries `evidence` and
# `confidence` columns (not visible here - confirm), the names collide on the
# 1st, 3rd and 5th joins only. On the 2nd and 4th joins the new columns come
# in unsuffixed and are renamed by the join that follows. The final column
# names therefore depend on this exact join order and on the number of joins;
# adding or removing a release changes which suffixes are applied.
# NOTE(review): if a (TF_name, gene_name, effect) key is repeated within a
# release, each join returns every matching combination, so rows can multiply;
# the `distinct()` below is disabled.
all_tf_gene <- network_set_10.6.3 %>%
  dplyr::full_join(network_set_10.7 %>% dplyr::select(TF_name, gene_name, effect, evidence, confidence), 
                   by = c("TF_name", "gene_name", "effect"), suffix = c("_10.6.3", "_10.7")) %>%
  dplyr::full_join(network_set_10.8 %>% dplyr::select(TF_name, gene_name, effect, evidence, confidence), 
                   by = c("TF_name", "gene_name", "effect"), suffix = c("_10.7", "_10.8")) %>%
  dplyr::full_join(network_set_10.9 %>% dplyr::select(TF_name, gene_name, effect, evidence, confidence), 
                   by = c("TF_name", "gene_name", "effect"), suffix = c("_10.8", "_10.9")) %>%
  dplyr::full_join(network_set_11.0 %>% dplyr::select(TF_name, gene_name, effect, evidence, confidence), 
                   by = c("TF_name", "gene_name", "effect"), suffix = c("_10.9", "_11.0")) %>%
  dplyr::full_join(network_set_11.0.1 %>% dplyr::select(TF_name, gene_name, effect, evidence, evidence_function, confidence), 
                   by = c("TF_name", "gene_name", "effect"), suffix = c("_11.0", "_11.0.1")) %>%
  dplyr::arrange(TF_name, gene_name)
  # distinct()
  # dplyr::rename(confidence_11.0.2 = confidence, evidence_11.0.2 = evidence)

# Export the merged table as a tab-separated file with a header row, without
# quoting and without row names.
write.table(all_tf_gene, file = "all_tf_gene.tsv", quote = F, row.names = F, col.names = T, sep = "\t")

network_tf_tu.txt

Data

# Releases compared for the TF-TU network: every entry of dir_versions except
# 10.10 and 11.0.2.
versions_nw <- setdiff(dir_versions, c("10.10", "11.0.2"))

# Read each release's network_tf_tu.tsv (lines starting with "#" are skipped,
# empty cells become NA), tag every row with "v<version>", and bind the table
# to the name network_tu_set_<version> in the current environment.
for (v in versions_nw) {
  assign(
    paste0("network_tu_set_", v),
    dplyr::mutate(
      read.delim(
        paste0(dir_releases, "/", v, "/network_tf_tu.tsv"),
        header = TRUE,
        comment.char = "#",
        na.strings = "",
        stringsAsFactors = FALSE
      ),
      version = paste0("v", v)
    )
  )
}

NB: additional TAB characters at the end of each line cause parsing issues

# Number of rows in each release's TF-TU network table.
tf_tu_summary <- lapply(
  setNames(versions_nw, paste0("v", versions_nw)),
  function(release) {
    data.frame(
      version = paste0("v", release),
      count = nrow(get(paste0("network_tu_set_", release)))
    )
  }
)
tf_tu_summary_df <- data.table::rbindlist(tf_tu_summary)

# Fix the release order. This used to be assigned to tf_gene_summary_df,
# which overwrote the TF-gene summary and left this table unordered.
tf_tu_summary_df$version <- factor(
  tf_tu_summary_df$version,
  levels = paste0("v", versions_nw)
)

DT::datatable(
  tf_tu_summary_df,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

# Bar plot of the entry counts; also reused as the top panel of the
# interactive TF-TU figures.
tf_tu_num <- simple_bar(tf_tu_summary_df, "version", "count") +
  scale_fill_viridis(discrete = TRUE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(x = "Release version", y = "Number of entries", title = "")

tf_tu_num

TF-TU pairs

# Distinct TF-TU pairs per release, keyed as "<TF_name>_<TU_name>".
# The key is built with a vectorised paste0() inside a function, so no
# rowwise() pass is needed and no global named `pairs` (which would mask
# graphics::pairs) is left behind.
tf_tu_pairs <- lapply(
  setNames(versions_nw, paste0("v", versions_nw)),
  function(release) {
    tu_network <- get(paste0("network_tu_set_", release))
    unique(paste0(tu_network[["TF_name"]], "_", tu_network[["TU_name"]]))
  }
)

# UpSet plot of the pair sets, keeping the releases in chronological order and
# sorting intersections by size. The dangling trailing comma that passed an
# empty argument to upset() has been removed.
UpSetR::upset(
  UpSetR::fromList(tf_tu_pairs),
  sets = paste0("v", versions_nw),
  order.by = "freq",
  keep.order = TRUE,
  text.scale = c(2, 2, 2, 2, 2, 2)
)

Duplicate TF-TU pairs

# For every release, tally how many TF-TU pairs occur once, twice, ... in the
# TF-TU network table. A pair is identified by "<TF_name>_<TU_name>".
# The key is built directly in group_by(); paste0() is vectorised, so the
# former rowwise() pass was unnecessary.
pairs_tu_ids_dupli <- lapply(
  setNames(versions_nw, paste0("v", versions_nw)),
  function(release) {
    get(paste0("network_tu_set_", release)) %>%
      dplyr::group_by(TF_TU = paste0(TF_name, "_", TU_name)) %>%
      dplyr::summarise(occurrences = dplyr::n()) %>%
      dplyr::group_by(occurrences) %>%
      dplyr::summarise(pairs_number = dplyr::n()) %>%
      dplyr::mutate(version = paste0("v", release))
  }
)

pairs_tu_ids_dupli_df <- data.table::rbindlist(pairs_tu_ids_dupli) %>%
  dplyr::mutate(occurrences = factor(occurrences))

# Fix the release order on the x axis.
pairs_tu_ids_dupli_df$version <- factor(
  pairs_tu_ids_dupli_df$version,
  levels = paste0("v", versions_nw)
)

# The title used to name network_tf_gene.txt although the data comes from the
# TF-TU network files.
dodge <- ggplot(
  pairs_tu_ids_dupli_df,
  aes(x = version, y = pairs_number, fill = occurrences)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14)
  ) +
  labs(
    x = "",
    y = "Number of pairs",
    title = "TF-TU pairs duplication in network_tf_tu.txt across versions"
  )

dodge

Evidence in network TF-TU

NB: evidence is formatted differently between versions of the downloadable file, which can cause parsing issues

# Evidence codes per release for the TF-TU network.
#
# Releases 10.7 to 11.0: `evidence` is split on "," and the square brackets are
# stripped to obtain one evidence code per row.
tf_tu_evidence <- bind_rows(
  network_tu_set_10.7,
  network_tu_set_10.8,
  network_tu_set_10.9,
  network_tu_set_11.0
) %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence_code = gsub("\\[|\\]", "", evidence))

# Release 11.0.1 has two evidence columns; they are concatenated and each
# entry is split on "|" into code / level / name.
# An empty "[]" entry is first rewritten to "[||]" so it still yields three
# (empty) fields when split.
# No rowwise() is used: gsub() and paste0() are vectorised, so the result is
# the same without per-row evaluation and without adding rowwise grouping.
# NOTE(review): the TF-TU files are read with na.strings = "", and paste0()
# renders NA as the string "NA"; an empty evidence cell would therefore show
# up as a literal "NA" evidence code - confirm against the data.
last <- network_tu_set_11.0.1 %>%
  dplyr::mutate(
    evidence_function = gsub("\\[\\]", "\\[\\|\\|\\]", evidence_function),
    evidence = gsub("\\[\\]", "\\[\\|\\|\\]", evidence),
    evidence = paste0(evidence, ",", evidence_function)
  ) %>%
  tidyr::separate_rows(evidence, sep = ",") %>%
  dplyr::mutate(evidence = gsub("\\[|\\]", "", evidence)) %>%
  tidyr::separate(
    evidence,
    c("evidence_code", "evidence_level", "evidence_name"),
    sep = "\\|"
  )

tf_tu_evidence <- bind_rows(tf_tu_evidence, last) %>%
  dplyr::mutate(evidence_code = trimws(evidence_code))

# Fix the release order on the x axis.
tf_tu_evidence$version <- factor(
  tf_tu_evidence$version,
  levels = paste0("v", versions_nw)
)

# Number of rows per (release, evidence code); groups are dropped explicitly
# so the result is a plain table and summarise() emits no grouping message.
tf_tu_evidence_long <- tf_tu_evidence %>%
  dplyr::group_by(version, evidence_code) %>%
  dplyr::summarise(value = dplyr::n(), .groups = "drop")

dodge <- ggplot(
  tf_tu_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of entries", title = "")

stack <- ggplot(
  tf_tu_evidence_long,
  aes(x = version, y = value, fill = evidence_code, group = evidence_code)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of evidence", title = "")

line <- ggplot(
  tf_tu_evidence_long,
  aes(x = version, y = value, group = evidence_code)
) +
  geom_line(aes(color = evidence_code)) +
  geom_point(size = 2, aes(color = evidence_code)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of entries", title = "")

# Interactive versions, stacked in four rows under the TF-TU entry-count plot.
fig0 <- ggplotly(tf_tu_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4, shareY = TRUE)

Confidence

NB: confidence is now written in the downloadable file in all caps, which can cause mapping issues

NB: the confidence evolution looks weird compared to TF-gene and the TFBS set

# Confidence levels per release for the TF-TU network.
#
# This chunk used to read network_set_<version> (the TF-gene tables) and to
# plot tf_gene_num, so the "TF-TU" confidence figures actually showed TF-gene
# data. It now reads network_tu_set_<version> and uses tf_tu_num.
# NOTE(review): assumes every network_tu_set_<version> has a `confidence`
# column, as the TF-gene tables do - confirm against the files.
# NOTE: the object names (confidence_nw_*, tf_gene_confidence_long) are the
# ones the TF-gene confidence section also assigns; they are kept unchanged so
# the saved workspace exposes the same names, but here they hold TF-TU counts.
confidence_nw_all_versions <- list()
for (v in versions_nw) {
  confidence_nw_all_versions[[v]] <- get(paste0("network_tu_set_", v)) %>%
    dplyr::group_by(version, confidence) %>%
    dplyr::summarise(value = dplyr::n(), .groups = "drop")
}

# Confidence is lower-cased so that all-caps releases map onto the same
# levels; any value outside weak / strong / confirmed becomes NA.
confidence_nw_all_versions_df <- data.table::rbindlist(confidence_nw_all_versions) %>%
  dplyr::mutate(confidence = tolower(confidence))
confidence_nw_all_versions_df$confidence <- factor(
  confidence_nw_all_versions_df$confidence,
  levels = c("weak", "strong", "confirmed")
)
confidence_nw_all_versions_df$version <- factor(
  confidence_nw_all_versions_df$version,
  levels = paste0("v", versions_nw)
)

tf_gene_confidence_long <- confidence_nw_all_versions_df

dodge <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = position_dodge(preserve = "single")) +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "right"
  ) +
  labs(x = "", y = "Number of entries", title = "")

# The y label used to read "Number of TFBS"; the bars count network entries,
# like the two sibling plots.
stack <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, fill = confidence)
) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "", y = "Number of entries", title = "")

line <- ggplot(
  tf_gene_confidence_long,
  aes(x = version, y = value, group = confidence)
) +
  geom_line(aes(color = confidence)) +
  geom_point(size = 2, aes(color = confidence)) +
  scale_color_viridis(discrete = TRUE, na.value = "gray", drop = FALSE) +
  theme_minimal() +
  theme(
    axis.text = element_text(size = 14),
    axis.title.y = element_text(size = 14),
    title = element_text(size = 14),
    legend.position = "none"
  ) +
  labs(x = "Release version", y = "Number of entries", title = "")

# Top panel is the TF-TU entry-count plot, not the TF-gene one.
fig0 <- ggplotly(tf_tu_num)
fig1 <- ggplotly(dodge)
fig2 <- ggplotly(stack)
fig3 <- ggplotly(line)

subplot(fig0, fig1, fig2, fig3, nrows = 4, shareY = TRUE)

# Wide summary table: one row per confidence level (NA shown as "null"), one
# column per release, missing combinations filled with 0, plus a "total" row.
confidence_nw_summary <- confidence_nw_all_versions_df %>%
  arrange(confidence) %>%
  mutate(confidence = ifelse(is.na(confidence), "null", as.character(confidence))) %>%
  data.frame() %>%
  pivot_wider(names_from = version, values_from = value) %>%
  mutate(across(starts_with("v"), ~ replace_na(., 0))) %>%
  bind_rows(summarise(
    .,
    across(where(is.numeric), sum),
    across(where(is.character), ~"total")
  ))

DT::datatable(
  confidence_nw_summary,
  rownames = FALSE,
  options = list(searching = FALSE, lengthChange = FALSE, pageLength = 10)
)

Other

# Persist the whole workspace of the report for later reuse.
save.image(file = "Binding_dataset_report.Rdata")